# Prerequisites:
#   - RStudio 3.2.4
#   - Internet access to download the Dropbox-hosted CSV file (128 MB)
#   - Folder 'sfzipcodes' located in the RMD's working directory
#     (https://github.com/avennu01/three-amigos/tree/master/sfzipcodes)
# Note: we were unable to read the zipped data directly from GitHub with the call below.
## dat_crime <- read.csv(unz("https://github.com/avennu01/three-amigos/blob/master/train.csv.zip","train.csv"))
# Load the raw crime records straight from the Dropbox-hosted CSV file.
dat_crime <- read.csv(
  file = "https://www.dropbox.com/s/kjkt5ndf3jkibq4/train.csv?raw=1"
)
#table(dat_crime$Category)
# Separating Dates into Date & Time and keeping only the categories below; the rest were eliminated by group consensus since they didn't have an impact on our end objective
# Missing Person .. needs a deeper dive
# First pass: split the Dates column into Date and Time, keep the candidate
# categories, then count how often each (Category, Descript) pair occurs.
dat_crimeNew <- dat_crime %>%
  separate(col = Dates, into = c("Date", "Time"), sep = " ", fill = "right") %>%
  filter(
    Category %in% c(
      "ASSAULT", "DISORDERLY CONDUCT", "DRUNKENNESS", "DRUG/NARCOTIC",
      "KIDNAPPING", "LARCENY/THEFT", "LOITERING", "MISSING PERSON",
      "NON-CRIMINAL", "ROBBERY", "SECONDARY CODES", "SEX OFFENSES FORCIBLE",
      "SEX OFFENSES NON FORCIBLE", "STOLEN PROPERTY", "SUSPICIOUS OCC",
      "VANDALISM", "VEHICLE THEFT"
    )
  )
dat_occ <- dat_crimeNew %>%
  group_by(Category, Descript) %>%
  summarise(occurence = n())
# Based on the above table, the following decisions have been made
# Descripts to consider apart from the ones selected .... ASSAULT, BATTERY, STALKING, LOITERING,
# complete categories to consider "KIDNAPPING","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE"
# complete categories to ignore "VANDALISM","VEHICLE THEFT"
# Second pass: same split of Dates, but without the VANDALISM and
# VEHICLE THEFT categories.
dat_crimeNew <- dat_crime %>%
  separate(col = Dates, into = c("Date", "Time"), sep = " ", fill = "right") %>%
  filter(
    Category %in% c(
      "ASSAULT", "DISORDERLY CONDUCT", "DRUNKENNESS", "DRUG/NARCOTIC",
      "KIDNAPPING", "LARCENY/THEFT", "LOITERING", "MISSING PERSON",
      "NON-CRIMINAL", "ROBBERY", "SECONDARY CODES", "SEX OFFENSES FORCIBLE",
      "SEX OFFENSES NON FORCIBLE", "STOLEN PROPERTY", "SUSPICIOUS OCC"
    )
  )
# Occurrence count per (Category, Descript) pair.
dat_occ <- dat_crimeNew %>%
  group_by(Category, Descript) %>%
  summarise(occurence = n())
# Wide form of the counts: one column per Descript value, keyed by Category.
p <- spread(dat_occ, Descript, occurence)
# Fragments of Descript values to keep; each one is looked up against the
# column names of `p`.
final_desc <- c(
  "AGGRAVATED ASSAULT WITH",
  "ASSAULT, AGGRAVATED, W/",
  "ATTEMPTED HOMICIDE WITH",
  "ATTEMPTED MAYHEM WITH",
  "ATTEMPTED SIMPLE",
  "BATTERY WITH",
  "MAYHEM WITH",
  "THREATS AGAINST",
  "WILLFUL CRUELTY",
  "COMMITTING PUBLIC",
  "DISTURBING THE PEAC",
  "MAINTAINING A PUBLIC",
  "FOR SALE",
  "SALE OF",
  "UNDER INFLUENCE",
  "UNDER THE INFLUENCE",
  "ATTEMPTED PETTY THEFT",
  "GRAND THEFT PICK",
  "GRAND THEFT PURSE",
  "PETTY THEFT",
  "THEFT, DRUNK ROLL,",
  "AIDED CASE, DOG",
  "AIDED CASE, INJURED",
  "ASSAULT TO ROB WITH ",
  "ATTEMPTED ROBBERY ON THE STREET",
  "ATTEMPTED ROBBERY WITH",
  "ROBBERY ON THE STREET ",
  "ROBBERY, ARMED WITH A ",
  "ROBBERY, BODILY",
  "ASSAULT BY JUVENILE",
  "SHOOTING BY JUVENILE",
  "ANNOY OR MOLEST",
  "STOLEN CELLULAR PHONE",
  "STOLEN ELECTRONICS",
  "SUSPICIOUS A",
  "SUSPICIOUS OCCU",
  "SUSPICIOUS PER",
  "MISSING"
)
# Return the names of the Descript columns of the wide table `p` that contain
# the fragment `x`.
#
# x: a single character string. It is matched literally (not as a regular
#    expression) and case-insensitively, which is how dplyr's contains()
#    matched it before.
# Returns: a character vector of matching column names, possibly empty. The
#    "Category" key column is never returned.
#
# Reads the global `p`. The earlier implementation depended on select()
# silently re-adding the grouping column as the first column and then dropped
# it by position with [-1]; had `p` been ungrouped, the first genuine match
# would have been discarded instead. Excluding "Category" by name does not
# depend on the grouping state of `p`.
select_desc <- function(x) {
  descript_cols <- setdiff(names(p), "Category")
  descript_cols[grepl(tolower(x), tolower(descript_cols), fixed = TRUE)]
}
# Final data set: every row whose Descript matches one of the fragments in
# final_desc, plus every row of the categories that are kept wholesale.
# A single filter() with OR replaces the earlier full_join() of the two
# subsets. That join had no `by`, so it matched on every shared column; rows
# that satisfied both conditions and had exact duplicates were multiplied
# (k identical records became k * k), and a join over the whole table was
# needed just to compute a union. Row order now follows dat_crimeNew.
# NOTE(review): the row count recorded in the output comment further down was
# produced by the join-based version and may differ slightly; re-knit to
# refresh it.
dat_crimefinal <- dat_crimeNew %>%
  filter(
    Descript %in% unlist(lapply(final_desc, select_desc)) |
      Category %in% c(
        "KIDNAPPING", "SEX OFFENSES FORCIBLE", "SEX OFFENSES NON FORCIBLE"
      )
  )
# Reduction in observations
#dim(dat_crime)
#dim(dat_crimeNew)
#dim(dat_crimefinal)
# Report how many rows remain after each filtering step. The "## [1]" lines
# appear to be the output recorded when the document was last rendered.
paste("Original raw data set was ", nrow(dat_crime), " observances.")
## [1] "Original raw data set was 878049 observances."
# This message previously read "Original raw data set was BLA BLA", a
# placeholder that mislabelled the category-filtered data as the raw data.
paste("After filtering to the selected categories, the data set was ", nrow(dat_crimeNew), " observances.")
## [1] "After filtering to the selected categories, the data set was 509681 observances."
paste("Lastly, all data not impacting a pedistrian was removed, leaving", nrow(dat_crimefinal), " observances.")
## [1] "Lastly, all data not impacting a pedistrian was removed, leaving 195711 observances."
# Based on dat_occ_final we can categorise the level of severity of each crime if we want to present our results that way
# Occurrence count per (Category, Descript) pair in the final data set.
dat_occ_final <- summarise(
  group_by(dat_crimefinal, Category, Descript),
  occurence = n()
)
# Subset of categories treated as crimes against life.
cat_life <- filter(
  dat_crimefinal,
  Category %in% c(
    "ASSAULT", "KIDNAPPING", "MISSING PERSON", "SECONDARY CODES",
    "SEX OFFENSES FORCIBLE", "SEX OFFENSES NON FORCIBLE", "SUSPICIOUS OCC"
  )
)
# Subset of categories treated as property crimes.
cat_prop <- filter(
  dat_crimefinal,
  Category %in% c("STOLEN PROPERTY", "ROBBERY", "LARCENY/THEFT")
)
# Subset of categories treated as nuisance crimes.
cat_nui <- filter(
  dat_crimefinal,
  Category %in% c(
    "DISORDERLY CONDUCT", "DRUG/NARCOTIC", "DRUNKENNESS", "NON-CRIMINAL"
  )
)
# Remove the intermediate objects that are no longer required.
rm(dat_crime, dat_crimeNew, dat_occ)
# Longitude and latitude of every retained record, taken from the X and Y
# columns.
x <- dat_crimefinal[["X"]]
y <- dat_crimefinal[["Y"]]
# Base map of San Francisco, fetched via ggmap's get_map().
map_SF <- get_map(zoom = 12, location = "San Francisco")
map <- ggmap(map_SF)
#Inspiration and ideas from the following sources
#https://rpubs.com/nickbearman/r-google-map-making
#http://www.r-bloggers.com/contour-and-density-layers-with-ggmap/
#https://rpubs.com/hegupta/151080
# Heat map using all data across all categories; the zip code boundaries are
# layered on top of this plot further down.
# The density layers now map the X/Y columns of their own `data` argument.
# They previously used aes(x = x, y = y), which only worked because global
# vectors named `x` and `y` happened to exist with the same length and order
# as W.
W <- dat_crimefinal
plot <- ggmap(map_SF, extent = "panel", maprange = FALSE) +
  geom_density2d(data = W, aes(x = X, y = Y)) +
  stat_density2d(
    data = W,
    aes(x = X, y = Y, fill = ..level.., alpha = ..level..),
    size = 0.01, bins = 16, geom = "polygon"
  ) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(legend.position = "none") +
  ggtitle("All crime categories for wrangled data with zip code boundaries")
# Layer the zip code boundaries on the map above.
# Zip <- readOGR(".", "SFZipCodes")
Zip <- readOGR(layer = "SFZipCodes", dsn = path.expand("./sfzipcodes"))
## OGR data source with driver: ESRI Shapefile
## Source: "./sfzipcodes", layer: "SFZipCodes"
## with 27 features
## It has 11 fields
# Re-project to WGS84 longitude/latitude, then fortify() the result so that
# geom_polygon() can read its long/lat/group columns.
Zip <- fortify(spTransform(Zip, CRS("+proj=longlat +datum=WGS84")))
plot +
  geom_polygon(
    data = Zip,
    mapping = aes(x = long, y = lat, group = group),
    color = "blue", fill = "grey", size = 0.2, alpha = 0.2
  )
# Heat map across the categories deemed "property".
# x1/y1 are still assigned in case other code reads them, but the layers now
# map the X/Y columns of cat_prop directly instead of relying on these global
# vectors lining up with the layer data.
x1 <- cat_prop$X
y1 <- cat_prop$Y
ggmap(map_SF, extent = "panel", maprange = FALSE) +
  geom_density2d(data = cat_prop, aes(x = X, y = Y)) +
  stat_density2d(
    data = cat_prop,
    aes(x = X, y = Y, fill = ..level.., alpha = ..level..),
    size = 0.01, bins = 16, geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "black") +
  theme(legend.position = "none") +
  ggtitle("Category Property Crimes")
# Heat map across the categories deemed "life".
# x2/y2 are still assigned in case other code reads them, but the layers now
# map the X/Y columns of cat_life directly instead of relying on these global
# vectors lining up with the layer data.
x2 <- cat_life$X
y2 <- cat_life$Y
ggmap(map_SF, extent = "panel", maprange = FALSE) +
  geom_density2d(data = cat_life, aes(x = X, y = Y)) +
  stat_density2d(
    data = cat_life,
    aes(x = X, y = Y, fill = ..level.., alpha = ..level..),
    size = 0.01, bins = 16, geom = "polygon"
  ) +
  scale_fill_gradient(low = "yellow", high = "blue") +
  theme(legend.position = "none") +
  ggtitle("Category Life Crimes")
# Heat map across the categories deemed "nuisance".
# x3/y3 are still assigned in case other code reads them, but the layers now
# map the X/Y columns of cat_nui directly instead of relying on these global
# vectors lining up with the layer data.
x3 <- cat_nui$X
y3 <- cat_nui$Y
ggmap(map_SF, extent = "panel", maprange = FALSE) +
  geom_density2d(data = cat_nui, aes(x = X, y = Y)) +
  stat_density2d(
    data = cat_nui,
    aes(x = X, y = Y, fill = ..level.., alpha = ..level..),
    size = 0.01, bins = 16, geom = "polygon"
  ) +
  scale_fill_gradient(low = "yellow", high = "red") +
  theme(legend.position = "none") +
  ggtitle("Category Nuisance Crimes")
# Add a "class" column: "Prop" for the property categories, "Nui" for the
# nuisance categories, and "Life" for every other category.
dat <- mutate(
  dat_crimefinal,
  class = ifelse(
    test = Category %in% c("STOLEN PROPERTY", "ROBBERY", "LARCENY/THEFT"),
    yes = "Prop",
    no = ifelse(
      test = Category %in% c(
        "DISORDERLY CONDUCT", "DRUG/NARCOTIC", "DRUNKENNESS", "NON-CRIMINAL"
      ),
      yes = "Nui",
      no = "Life"
    )
  )
)
# Occurrence count per category.
dat1 <- summarise(group_by(dat, Category), occurence = n())
# Bar chart of record counts per category. The x-axis tick labels are hidden;
# the fill legend identifies the categories instead.
qplot(x = Category, data = dat, geom = "bar", fill = Category) +
  labs(
    title = "Crime Categories in San Francisco",
    x = "Category",
    y = "Occurence"
  ) +
  theme(axis.text.x = element_blank())
# Bar chart of record counts per police district, stacked by crime class.
# The x aesthetic is the bare column name resolved inside `data`; it used to
# be dat$PdDistrict, which bypasses `data`. The partially matched argument
# `label` is spelled out as `labels`.
qplot(PdDistrict, data = dat, geom = "bar", fill = class) +
  scale_x_discrete(labels = abbreviate) +
  ggtitle("Crime by district and class in San Francisco") +
  xlab("District") +
  ylab("Crimes by class")
# Further analysis of the Southern district.
# Daily average crime by class in the Southern district.
# The earlier summarize(DailyAvg = mean(n())) returned the total row count of
# each (DayOfWeek, class) group, because n() is a single number per group and
# its mean is itself. Each group's count is now divided by the number of
# distinct calendar dates that fall on that weekday in the Southern subset.
dat2 <- dat %>%
  filter(PdDistrict == "SOUTHERN") %>%
  group_by(DayOfWeek) %>%
  mutate(n_days = n_distinct(Date)) %>%
  group_by(DayOfWeek, class) %>%
  summarize(DailyAvg = n() / n_days[1])
# Aesthetics refer to columns of dat2 by name rather than through dat2$...,
# so they stay tied to the `data` argument.
ggplot(dat2, aes(x = DayOfWeek, y = DailyAvg, fill = factor(class))) +
  geom_bar(stat = "identity") +
  labs(
    x = "Day of Week",
    y = "Daily Average",
    fill = "class",
    title = "Daily Avg Crime by class for Southern District"
  )
# Time-of-day analysis for the Southern district.
dat3 <- filter(dat, PdDistrict == "SOUTHERN")
# Frequency of each distinct Time value; Var1 holds the time string.
time <- as.data.frame(table(dat3$Time), stringsAsFactors = FALSE)
# The first two characters of the time string are used as the hour.
time$Hour <- substr(time$Var1, start = 1, stop = 2)
TimeOfDay <- ggplot(data = time, mapping = aes(x = Hour, y = Freq))
# Sum the frequencies within each hour and draw them as bars.
TimeOfDay +
  stat_summary(fun.y = sum, geom = "bar", fill = "grey") +
  labs(
    title = "Southern District crime timings",
    x = "Time of Day",
    y = "Occurrences"
  )